#pragma once

// Classes:
//    Intel::XEON::PMU: Manages 3 fixed counters and up to 8 programmable counters.
//                      See 'doc/pmu.doc' for details including refs for constants.

#include <assert.h>

#include <sys/types.h>
#include <sched.h>
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>

#include <string>
#include <vector>
#include <iostream>

namespace Intel {

// Credit to Nanobench for documenting this code, which it in turn copied from Google. Me too!
// The purpose of this function is to stop the compiler from optimizing away a result
// (typically of a find/search here) so the outer loop isn't boiled down to a no-op or
// near no-op. It tells the compiler 'value' is needed elsewhere even if the compiler
// doesn't know who/where. Original code location:
//    https://github.com/google/benchmark include/benchmark/benchmark.h#L444
template <class Tp>
inline __attribute__((always_inline)) void DoNotOptimize(Tp& value) {
  // The asm template is empty, so no instruction is emitted. 'value' is listed as a
  // read-write operand ("+m,r": memory or register) and "memory" is listed as clobbered,
  // so the compiler must materialize 'value' and assume it may have been modified.
  asm volatile("" : "+m,r"(value) : : "memory");
}

namespace XEON {

struct PMU {
  // This class configures, starts, resets, and reads the fixed and programmable performance
  // counters of one HW core by reading/writing its MSRs through a file handle ('d_fid') and by
  // executing 'rdpmc'/'rdtsc' directly.

  // ENUM
  enum ProgCounterSetConfig {
    // +-----------------------------------------------------------------------------------------------+
    // | Intel architecturally significant metrics basic set good for Intel Skylake and later PMUs.    |
    // | All counters count user-code (ring 3) run only. Kernel code is ignored. Counters only see the |
    // | work executed by the CPU HW thread they were configured against; see 'print()'.               |
    // +-----------------------------------------------------------------------------------------------+
    // | Fixed        Counter 0: https://perfmon-events.intel.com/ -> Number of retired instructions   |
    // | Fixed        Counter 1: https://perfmon-events.intel.com/ -> Cycles elapsed not in halt state |
    // | Fixed        Counter 2: https://perfmon-events.intel.com/ -> Ref cycles not in halt state     |
    // | Programmable Counter 0: https://perfmon-events.intel.com/ -> LONGEST_LAT_CACHE.REFERENCE      |
    // | Programmable Counter 1: https://perfmon-events.intel.com/ -> LONGEST_LAT_CACHE.MISS           |
    // | Programmable Counter 2: https://perfmon-events.intel.com/ -> BR_INST_RETIRED.ALL_BRANCHES_PS  |
    // | Programmable Counter 3: https://perfmon-events.intel.com/ -> BR_INST_RETIRED.COND_NTAKEN      |
    // +-----------------------------------------------------------------------------------------------+
    k_DEFAULT_XEON_CONFIG_0 = 0,
    k_DEFAULT_CONFIG_UNDEFINED = 1,     // Sentinel: one past the last valid configuration
  };

  enum Support {
    k_FIXED_COUNTERS            = 3,    // All boxes have 3 fixed counters
    k_MAX_PROG_COUNTERS_HT_ON   = 4,    // When CPU hyper threading ON  prog counters 0,1,2,3 available
    k_MAX_PROG_COUNTERS_HT_OFF  = 8,    // When CPU hyper threading OFF prog counters mostly [0-7] available
                                        // See https://perfmon-events.intel.com by event for details
  };

private:
  // MSRs reporting overflow status and globally enabling/disabling counters
  const u_int32_t IA32_PERF_GLOBAL_STATUS = 0x38e;
  const u_int32_t IA32_PERF_GLOBAL_CTRL   = 0x38f;

  // MSR to configure programmable counter
  const u_int32_t IA32_PERFEVTSEL0      = 0x186;
  const u_int32_t IA32_PERFEVTSEL1      = 0x187;
  const u_int32_t IA32_PERFEVTSEL2      = 0x188;
  const u_int32_t IA32_PERFEVTSEL3      = 0x189;
  const u_int32_t IA32_PERFEVTSEL4      = 0x18a;
  const u_int32_t IA32_PERFEVTSEL5      = 0x18b;
  const u_int32_t IA32_PERFEVTSEL6      = 0x18c;
  const u_int32_t IA32_PERFEVTSEL7      = 0x18d;

  // Initial programmable counter values written here
  const u_int32_t IA32_PMC0             = 0xc1;
  const u_int32_t IA32_PMC1             = 0xc2;
  const u_int32_t IA32_PMC2             = 0xc3;
  const u_int32_t IA32_PMC3             = 0xc4;
  const u_int32_t IA32_PMC4             = 0xc5;
  const u_int32_t IA32_PMC5             = 0xc6;
  const u_int32_t IA32_PMC6             = 0xc7;
  const u_int32_t IA32_PMC7             = 0xc8;

  // MSR written by 'reset()' to clear overflow bits
  const u_int32_t IA32_PERF_GLOBAL_STATUS_RESET = 0x390;

  // Initial fixed counter values written here
  const u_int32_t IA32_FIXED_CTR0       = 0x309;
  const u_int32_t IA32_FIXED_CTR1       = 0x30a;
  const u_int32_t IA32_FIXED_CTR2       = 0x30b;

  // MSR to configure fixed counters
  const u_int32_t IA32_FIXED_CTR_CTRL   = 0x38d;
  const u_int64_t DEFAULT_FIXED_CONFIG  = 0x222;

  // Overflow masks for programmable counter 0, fixed counter 0
  // The others are generated by left shifting by the counter number
  const u_int64_t PMC0_OVERFLOW_MASK      = (1ull<<0);  // 'doc/intel_msr.pdf p287'
  const u_int64_t FIXEDCTR0_OVERFLOW_MASK = (1ull<<32); // 'doc/intel_msr.pdf p287'

  // DATA
  int       d_fid;                             // file handle for MSR read/write; -1 when not open
  u_int16_t d_cnt;                             // # programmable counters in use [0, k_MAX_PROG_COUNTERS_HT_OFF]
  u_int64_t d_fcfg;                            // configuration for all fixed counters
  u_int64_t d_pcfg[k_MAX_PROG_COUNTERS_HT_OFF];// configuration for each programmable counter in [0, d_cnt)
  
  // Pretty-print helper data
  std::vector<std::string> d_fixedMnemonic;    // Nickname for fixed counters e.g. 'F2' for counter 2
  std::vector<std::string> d_fixedDescription; // Full description e.g. 'reference no-halt cpu cycles'

  // Pretty-print helper data
  std::vector<std::string> d_progMnemonic;     // Nickname for programmable counters e.g. 'P3' for counter 3
  std::vector<std::string> d_progDescription;  // Full description e.g. 'LLC misses'

public:
  // CREATORS
  PMU() = delete;
    // Default constructor not provided

  explicit PMU(ProgCounterSetConfig config);
    // Create a PMU object to run all fixed counters and programmable counters according to specified enumerated
    // value 'config'. The behavior is defined provided `config` is compatible with the host PMU hardware and HT
    // (hyper threading) configuration. See `doc/pmu.md` for background. Upon return callers should run `reset`.
    // Note this method unconditionally pins the caller's thread to the current, running core. If the thread was
    // already pinned before entry here, or the process was run under taskset, this pinning will have no effect.

  ~PMU();
    // Destroy this object, closing the MSR file handle if it is open.

  PMU(const PMU& other) = delete;
    // Copy constructor is not supported.

  // ACCESSORS
  int coreId() const;
    // Return the HW core number (zero-based) the caller is running on as reported by 'sched_getcpu()'.

  u_int16_t fixedCountersDefined() const;
    // Return the number of fixed counters managed by this object.

  u_int16_t programmableCountersDefined() const;
    // Return the number of programmable counters the `config` given at construction time configured.

  u_int64_t timeStampCounter() const;
    // Return the current value of 'rdtsc' for this thread's core

  u_int64_t programmableCounterValue(u_int16_t counter) const;
    // Return the current value of the specified programmable 'counter' on the HW core given by 'coreId()'. The
    // behavior is defined provided 'start()' or 'reset()' previously ran without error, and if 'counter' is in
    // the range '0<=counter<programmableCountersDefined()'

  u_int64_t fixedCounterValue(u_int16_t counter) const;
    // Return the current value of the specified fixed 'counter' on the HW core given by 'coreId()'. The behavior
    // is defined provided 'start()' or 'reset()' previously ran without error, and if 'counter' is in the range
    // '0<=counter<fixedCountersDefined()'.

  bool fixedCounterOverflowed(u_int16_t counter) const;
    // Return true if specified fixed 'counter' overflowed and false otherwise. The behavior is defined provided
    // 'start()' or 'reset()' previously ran without error, and if 'counter' is in `[0, fixedCountersDefined())`.

  bool programmableCounterOverflowed(u_int16_t counter) const;
    // Return true if specified programmable 'counter' overflowed and false otherwise. The behavior is defined provided
    // 'start()' or 'reset()' previously ran without error, and if 'counter' is in `[0, programmableCountersDefined())`

  const std::vector<std::string>& fixedMnemonic() const;
    // Return a non-modifiable reference to an array of mnemonic names assigned by this class at construction time for
    // the fixed counters.

  const std::vector<std::string>& fixedDescription() const;
    // Return a non-modifiable reference to an array of human readable descriptions assigned by this class at
    // construction time for the fixed counters.

  const std::vector<std::string>& programmableMnemonic() const;
    // Return a non-modifiable reference to an array of mnemonic names assigned by this class at construction time
    // for the programmable counters.

  const std::vector<std::string>& programmableDescription() const;
    // Return a non-modifiable reference to an array of human readable descriptions assigned by this class at
    // construction time for the programmable counters.

  // MANIPULATORS
  int reset();
    // Return zero if all counters requested at construction time are stopped, configured, and reset to 0, and
    // non-zero otherwise. The counters will not resume counting until 'start()' is called.

  int start();
    // Return 0 if all fixed counters, and all programmable counters defined at construction time, are running
    // and non-zero otherwise. The behavior is defined provided 'reset()' previously ran without error.
    // Counters run until 'reset()' is called.

  bool overflow();
    // Return true if any fixed or programmable counter overflowed, and false otherwise.

  PMU& operator=(const PMU& rhs) = delete;
    // Assignment operator not supported

  // ASPECTS
  std::ostream& print(std::ostream& stream) const;
    // Pretty print to specified 'stream' a human readable snapshot of counters defined at construction time and
    // their current values with overflow status, and return 'stream'.

private:
  // PRIVATE MANIPULATORS
  int pinToHWCore(int coreId);                                                                                   
    // Return 0 if the current/caller thread was pinned to 'coreId' and non-zero errno otherwise. Behavior is
    // defined provided 'coreId>=0' and 'coreId' is less than the total number of cores available in the underlying
    // HW as reported by 'cat /proc/cpuinfo'. Note this routine only enforces the minimum bound.

  int overflowStatus(u_int64_t *value) const;
    // Return 0 and write into specified 'value' the contents of the IA32_PERF_GLOBAL_STATUS MSR on success and
    // non-zero otherwise. See IR p708 figure 19-10 for interpretation of value.

  int rdmsr(u_int32_t reg, u_int64_t *value);
    // Return 0 if the contents of specified MSR 'reg' were read into specified 'value' on the HW-core previously
    // chosen by 'open' and non-zero otherwise.

  int wrmsr(u_int32_t reg, u_int64_t data);
    // Return 0 if specified 'data' was written into specified MSR 'reg' on the HW-core previously chosen by 'open'
    // and non-zero otherwise.

  int open(int cpu);
    // Return 0 if the MSR system file for specified 'cpu' was successfully opened and non-zero otherwise. On
    // success class member 'd_fid' holds the file handle to it.
};

// FREE OPERATORS
std::ostream& operator<<(std::ostream& stream, const PMU& object);
  // Write into specified 'stream' a human readable dump of specified 'object' by delegating to
  // 'object.print(stream)', and return the result of that call.

// INLINE DEFINITIONS
// CREATORS
inline
PMU::PMU(ProgCounterSetConfig config)
: d_fid(-1)
, d_cnt(0)
, d_fcfg(DEFAULT_FIXED_CONFIG)
{
  // Only enumerators below the 'undefined' sentinel name a usable configuration
  assert(config>=0 && config<k_DEFAULT_CONFIG_UNDEFINED);

  // Pin the calling thread to whichever core it is running on right now
  pinToHWCore(sched_getcpu());

  // Pretty-print labels for the three fixed counters
  d_fixedMnemonic = {"F0", "F1", "F2"};
  d_fixedDescription = {"retired instructions",
                        "no-halt cpu cycles",
                        "reference no-halt cpu cycles"};

  // Nothing further to set up unless the one known programmable set was requested
  if (config!=k_DEFAULT_XEON_CONFIG_0) {
    return;
  }

  // Pretty-print labels for the programmable counters of this configuration
  d_progMnemonic = {"P0", "P1", "P2", "P3"};
  d_progDescription = {"LLC references",
                       "LLC misses",
                       "retired branch instructions",
                       "retired branch instructions not taken"};

  // Event-select values later written to IA32_PERFEVTSEL0.. by 'start()', one per counter
  const u_int64_t eventSelect[] = {0x414f2e, 0x41412e, 0x4104c4, 0x4110c4};
  const u_int16_t total = sizeof eventSelect / sizeof eventSelect[0];
  for (u_int16_t slot = 0; slot < total; ++slot) {
    d_pcfg[slot] = eventSelect[slot];
  }

  // Four counters defined
  d_cnt = total;
}

inline
PMU::~PMU() {
  // Nothing to release when the MSR file was never opened
  if (d_fid==-1) {
    return;
  }

  // Release the MSR file handle and mark it closed
  ::close(d_fid);
  d_fid = -1;
}

// ACCESSORS
inline
int PMU::coreId() const {
  // Ask the OS which CPU the calling thread is executing on at this moment
  const int currentCpu = sched_getcpu();
  return currentCpu;
}

inline
u_int16_t PMU::fixedCountersDefined() const {
  // The fixed counter count is a compile-time constant of this class
  return static_cast<u_int16_t>(k_FIXED_COUNTERS);
}

inline
u_int16_t PMU::programmableCountersDefined() const {
  return d_cnt;
}

inline
u_int64_t PMU::timeStampCounter() const {
  u_int32_t lowHalf, highHalf;

  // Fence so earlier instructions complete before the counter is sampled
  __asm __volatile("mfence;lfence");

  // 'rdtsc' returns the low 32 bits in EAX and the high 32 bits in EDX
  __asm__ __volatile__ ("rdtsc" : "=a"(lowHalf), "=d"(highHalf));

  // Stitch the two halves into one 64-bit value
  u_int64_t stamp = highHalf;
  stamp <<= 32;
  stamp |= lowHalf;
  return stamp;
}

inline
u_int64_t PMU::programmableCounterValue(u_int16_t c) const {
  assert(c<programmableCounterDefined());
  u_int64_t a,d;                                                                                                        
  // Finish pending instructions                                                                                        
  __asm __volatile("mfence;lfence");                                                                                           
  // https://www.felixcloutier.com/x86/rdpmc                                                                            
  // https://hjlebbink.github.io/x86doc/html/RDPMC.html                                                                 
  // ECX register: bit 30 <- 0 (programmable cntr) w/ low order bits counter# zero based                                       
  __asm __volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c));
  // Result is written into EAX lower 32-bits and rest of bits up to counter-width in EDX                               
  return ((d<<32)|a);
}

inline
u_int64_t PMU::fixedCounterValue(u_int16_t c) const {
  assert(c<fixedCountersDefined());

  // ECX selector for 'rdpmc': bit 30 set picks the fixed counter bank and the low order bits
  // carry the zero-based counter number
  const int selector = (1<<30)+c;

  u_int64_t lowBits,highBits;

  // Let pending instructions finish before sampling the counter
  __asm __volatile("mfence;lfence");

  // References for the instruction:
  //   https://www.felixcloutier.com/x86/rdpmc
  //   https://hjlebbink.github.io/x86doc/html/RDPMC.html
  __asm __volatile("rdpmc" : "=a" (lowBits), "=d" (highBits) : "c" (selector));

  // EAX holds the lower 32-bits; EDX holds the remaining bits up to the counter width
  const u_int64_t upper = highBits<<32;
  return upper|lowBits;
}

inline
bool PMU::fixedCounterOverflowed(u_int16_t counter) const {
  assert(counter<fixedCountersDefined());
  u_int64_t overFlowStatus;                                                                                             
  auto object = const_cast<PMU*>(this);
  object->overflowStatus(&overFlowStatus);
  u_int64_t mask(FIXEDCTR0_OVERFLOW_MASK);
  for (u_int16_t i=0; i<=counter; ++i, mask<<=1);
  return (overFlowStatus & mask);
}

inline
bool PMU::programmableCounterOverflowed(u_int16_t counter) const {
  assert(counter<programmableCounterDefined());
  u_int64_t overFlowStatus;                                                                                             
  auto object = const_cast<PMU*>(this);
  object->overflowStatus(&overFlowStatus);
  u_int64_t mask(PMC0_OVERFLOW_MASK);
  for (u_int16_t i=0; i<=counter; ++i, mask<<=1);
  return (overFlowStatus & mask);
}

inline
const std::vector<std::string>& PMU::fixedMnemonic() const {
  return d_fixedMnemonic;
}

inline
const std::vector<std::string>& PMU::fixedDescription() const {
  return d_fixedDescription;
}

inline
const std::vector<std::string>& PMU::programmableMnemonic() const {
  return d_progMnemonic;
}

inline
const std::vector<std::string>& PMU::programmableDescription() const {
  return d_progDescription;
}

// MANIPULATORS
inline
int PMU::start() {
  // The MSR file must already be open
  assert(d_fid>0);

  // Second enablement, fixed counters: write their shared configuration
  int status = wrmsr(IA32_FIXED_CTR_CTRL, d_fcfg);
  if (status!=0) {
    return status;
  }

  // Second enablement, programmable counters: the event-select MSRs are consecutive, so
  // counter 'idx' is configured through IA32_PERFEVTSEL0+idx
  for (u_int16_t idx = 0; idx != d_cnt; ++idx) {
    status = wrmsr(IA32_PERFEVTSEL0 + idx, d_pcfg[idx]);
    if (status!=0) {
      return status;
    }
  }

  return 0;
}

inline
int PMU::reset() {
  // Stop every counter this object manages, zero its value, clear its overflow bit, then set
  // the global enable bits again. Return 0 on success and the first non-zero MSR/open error
  // code otherwise. The MSR file for the current core is opened on first use.
  int rc;

  if (d_fid<0) {
    if ((rc = open(coreId()))!=0) {
      return rc;
    }
  }

  assert(d_fid>0);

  // One bit per managed counter: bits [32,34] for the three fixed counters plus bits
  // [0, d_cnt) for the programmable counters. 'doc/pmu.md' discusses this number in detail.
  // The shift is done in 64 bits so the expression matches the width of the mask.
  u_int64_t counterMask = 0x700000000ull;
  for(u_int16_t i = 0; i < d_cnt; ++i) {
    counterMask |= (1ull<<i);
  }

  // Turn off all counters global level (1st disablement)
  if ((rc = wrmsr(IA32_PERF_GLOBAL_CTRL, 0))!=0) {
    return rc;
  }

  // Turn off all defined programmable counters (1st disablement)
  int msr = IA32_PERFEVTSEL0;
  for(u_int16_t i = 0; i < d_cnt; ++i, ++msr) {
    if ((rc = wrmsr(msr, 0))!=0) {
      return rc;
    }
  }

  // Turn off all fixed counters (1st disablement)
  if ((rc = wrmsr(IA32_FIXED_CTR_CTRL, 0))!=0) {
    return rc;
  }

  // Reset to 0 programmable counter values
  msr = IA32_PMC0;
  for(u_int16_t i = 0; i < d_cnt; ++i, ++msr) {
    if ((rc = wrmsr(msr, 0))!=0) {
      return rc;
    }
  }

  // Reset to 0 fixed counter values
  msr = IA32_FIXED_CTR0;
  for(u_int16_t i = 0; i < k_FIXED_COUNTERS; ++i, ++msr) {
    if ((rc = wrmsr(msr, 0))!=0) {
      return rc;
    }
  }

  // Clear overflow bits. The register is write-1-to-clear: a status bit is cleared only when
  // the corresponding bit written here is 1, so writing 0 would leave stale overflow flags.
  if ((rc = wrmsr(IA32_PERF_GLOBAL_STATUS_RESET, counterMask))!=0) {
    return rc;
  }

  // Re-enable all fixed and defined programmable counters (first enablement)
  if ((rc = wrmsr(IA32_PERF_GLOBAL_CTRL, counterMask))!=0) {
    return rc;
  }

  return 0;
}

inline
bool PMU::overflow() {
  // Return true if the overflow bit of any fixed or defined programmable counter is set in
  // IA32_PERF_GLOBAL_STATUS and false otherwise. Also return false if the status MSR could
  // not be read.

  // Zero-initialized so a failed read can never expose an indeterminate value
  u_int64_t overFlowStatus = 0;
  if (overflowStatus(&overFlowStatus)!=0) {
    return false;
  }

  // Collect the overflow bit of every counter this object manages
  u_int64_t watched = 0;
  u_int64_t mask(FIXEDCTR0_OVERFLOW_MASK);
  for (u_int16_t i=0; i<fixedCountersDefined(); ++i, mask<<=1) {
    watched |= mask;
  }
  mask = PMC0_OVERFLOW_MASK;
  for (u_int16_t i=0; i<programmableCountersDefined(); ++i, mask<<=1) {
    watched |= mask;
  }

  return (overFlowStatus & watched)!=0;
}


inline
int PMU::overflowStatus(u_int64_t *value) const {
  assert(value);

  int rc;
  auto object = const_cast<PMU*>(this);
  if ((rc = object->rdmsr(IA32_PERF_GLOBAL_STATUS, value))!=0) {
    return rc;
  }

  return 0;
}

inline
int PMU::rdmsr(u_int32_t reg, u_int64_t *value) {
  // Read the 8-byte contents of MSR 'reg' into 'value' through the open MSR file. Return 0 on
  // success and a non-zero errno-style code otherwise, after logging the failure to stderr.
  assert(d_fid>0);

  const ssize_t got = pread(d_fid, value, sizeof(u_int64_t), reg);
  if (got != static_cast<ssize_t>(sizeof(u_int64_t))) {
    // Capture the error before any library call can overwrite errno. A short read does not
    // set errno, so report it as EIO rather than returning a stale (possibly zero) value.
    const int err = (got<0) ? errno : EIO;
    fprintf(stderr, "Error: MSR read error on register 0x%x %s\n", reg, strerror(err));
    return err;
  }

  // printf("rdmsr reg 0x%x val 0x%lx\n", reg, *value);

  return 0;
}

inline
int PMU::wrmsr(u_int32_t reg, u_int64_t data) {
  // Write the 8-byte value 'data' into MSR 'reg' through the open MSR file. Return 0 on
  // success and a non-zero errno-style code otherwise, after logging the failure to stderr.
  assert(d_fid>0);

  // printf("wrmsr reg 0x%x val 0x%lx\n", reg, data);

  const ssize_t put = pwrite(d_fid, &data, sizeof data, reg);
  if (put != static_cast<ssize_t>(sizeof data)) {
    // Capture the error before any library call can overwrite errno. A short write does not
    // set errno, so report it as EIO rather than returning a stale (possibly zero) value.
    const int err = (put<0) ? errno : EIO;
    fprintf(stderr, "Error: MSR write error on register 0x%x value 0x%lx: %s\n", reg, data, strerror(err));
    return err;
  }

  return 0;
}

inline
int PMU::open(int cpu) {
  // Open '/dev/cpu/<cpu>/msr' read-write and store the handle in 'd_fid'. Return 0 on success
  // and the errno reported by the failed open otherwise, after logging the failure to stderr.
  // Behavior is defined provided 'cpu>=0' and no MSR file is currently open.
  assert(cpu>=0);
  assert(d_fid==-1);

  char msr_file_name[64];
  // Bounded formatting so the path can never overrun the buffer
  snprintf(msr_file_name, sizeof msr_file_name, "/dev/cpu/%d/msr", cpu);

  d_fid = ::open(msr_file_name, O_RDWR);
  if (d_fid < 0) {
    // Capture the error before any library call can overwrite errno
    const int err = errno;
    fprintf(stderr, "Error: cannot open '%s': %s\n", msr_file_name, strerror(err));
    return err;
  }

  return 0;
}

// FREE OPERATOR
// INLINE DEFINITIONS
inline
std::ostream& operator<<(std::ostream& stream, const PMU& object) {
  // Delegate the formatting to the object and hand back whatever stream it returns
  std::ostream& result = object.print(stream);
  return result;
}

} // namespace XEON
} // namespace Intel
